InĀ [1]:
import numpy as np
from scipy.stats import kendalltau
from sklearn.neighbors import KDTree as KDTree_whole
from scipy.spatial import KDTree as KDTree_layer
from sklearn.decomposition import NMF
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import random
from scipy import stats
from scipy.stats import hypergeom
InĀ [2]:
from pyannotate_runtime import collect_types
def rundifferential_test_AUC(df, metaClusterLabel, control, treatment, regulon_name, threshold_file, alpha = .05): """ Add a column to mark successful cells (greater than the mean enrichment score) M is the total number of objects, (all the cells = total population (low and high regulon activity) n is total number of Type I objects. (all cells that pass threshold) The random variate represents the number of Type I objects in N drawn without replacement from the total population.
k is the active amount of cells we are interested in
N the anumberof cells you are planning to sample from the tissue
sf(k, M, n, N, loc=0)
"""
#print(df.columns)
if threshold_file is None:
calculate_mean = df[regulon_name].mean()
df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > df[regulon_name].mean()
else:
print("found file!")
threshold = returnThreshold_dictionary(threshold_file, 'threshold')
print(threshold)
df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > threshold[regulon_name]
# Count successful control cells
control_successful_cells = df[(df[metaClusterLabel] == control) & (df['is_successful_{}'.format(regulon_name)])].shape[0]
# Count successful treatment cells
treatment_successful_cells = df[(df[metaClusterLabel] == treatment) & (df['is_successful_{}'.format(regulon_name)])].shape[0]
total_successful_cells = df[df['is_successful_{}'.format(regulon_name)]].shape[0]
total_population = df.shape[0]
n_1 = df[df[metaClusterLabel] == control].shape[0]
n_2 = df[df[metaClusterLabel] == treatment].shape[0]
print(control_successful_cells - 1, total_population, total_successful_cells, n_1)
p_value_1 = hypergeom.sf(control_successful_cells - 1, total_population, total_successful_cells, n_1)
p_value_2 = hypergeom.sf(treatment_successful_cells -1, total_population, total_successful_cells, n_2)
return(regulon_name, round(p_value_1, 2), round(p_value_2, 2))
InĀ [20]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import math
def calculate_r2_and_coefficients(condx, condy):
"""Perform linear regression and return R² and coefficients."""
regr = LinearRegression()
regr.fit(condx, condy)
y_pred = regr.predict(condx)
rss = np.sum((condy - y_pred) ** 2)
tss = np.sum((condy - np.mean(condy)) ** 2)
r2 = r2_score(condy, y_pred)
return r2, rss, tss, regr.coef_
def check_r2_consistency(r2, rss, tss):
"""Check if the calculated R² is consistent with the TSS and RSS."""
return r2 == 1 - (rss / tss)
def calculate_metrics(r2, rss, tss, coef):
"""Calculate the final metrics."""
r = math.sqrt(r2) * np.sign(coef)
return round(r.item(),2), round(r2, 2), round(tss, 2), round(rss, 2)
def process_data_for_labels(labels, NMF_embedd):
"""Process the data for each label."""
func_sep = {}
for i in labels:
condx = NMF_embedd[i][0].reshape(-1, 1)
condy = NMF_embedd[i][1].reshape(-1, 1)
r2, rss, tss, coef = calculate_r2_and_coefficients(condx, condy)
if check_r2_consistency(r2, rss, tss):
print(i, " works")
r_metric, r2_metric, tss_metric, rss_metric = calculate_metrics(r2, rss, tss, coef)
func_sep[i] = (r_metric, r2_metric, tss_metric, rss_metric)
return func_sep
InĀ [21]:
class ComparisonTree:
""" cond1 (control), data (RSS matrix), colnames are everything but the rownames, rownames (these are the regulons) """
def __init__(self, cond1, ras_data, metaData_RAS_column,
rss_data, colnames, rownames, threshold_file):
self.cond1 = cond1
self.data = rss_data
self.RAS_AUC = ras_data
self.labels = colnames
self.sep = []
self.map_me = {}
self.NMF_embedd = {}
self.h_sep = {}
self.x = self.data[self.cond1].tolist()
self.label = list(map( lambda x: x.split(" ")[0],self.data[rownames].tolist())) ## rownames of your regulons
self.metaDataCol = metaData_RAS_column
self.threshold_file = threshold_file
self.order = []
self.condition_label = []
self.tau_p_value = {}
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query_ball_tree.html
def compareLayers(self, layer1, layer2, distance, font=6.5):
plt.figure(figsize=(6, 6))
cond1x, cond1y = self.NMF_embedd[layer1][0], self.NMF_embedd[layer1][1]
cond2x, cond2y = self.NMF_embedd[layer2][0], self.NMF_embedd[layer2][1] # you can compare multiple experiments here
plt.plot(cond1x, cond1y, "xk", markersize=10, label=layer1)
plt.plot(cond2x, cond2y, "og", markersize=10, label=layer2)
compare_layers = self.NMF_embedd[layer1][4].query_ball_tree(self.NMF_embedd[layer2][4], r=distance)
for i in range(len(compare_layers)):
for j in compare_layers[i]:
plt.plot([cond1x[i], cond2x[i]], [cond1y[i], cond2y[i]], "-b")
for i in range(len(self.NMF_embedd[layer1][0])):
plt.text(cond1x[i], cond1y[i],
self.label[i], fontsize=font, ha='right', color='red')
for i in range(len(self.NMF_embedd[layer2][0])):
plt.text(cond2x[i], cond2y[i], self.label[i], fontsize=font, ha='left', color='green')
plt.title("distance < {} between {} and {}".format(distance, layer1, layer2))
plt.xlabel("NMF_base ({})".format(layer1))
plt.ylabel("NMF_exp ({})".format(layer2))
plt.legend()
plt.savefig("Layer {} and {}".format(layer1, layer2))
# Show the plot
plt.show()
def calc_global_dictonary(self):
dict_label = list(np.array([self.label] * len(self.condition_label)).flatten())
dict_label = list(zip([self.cond1] * len(self.condition_label), self.condition_label, dict_label))
Idx_embedd = {}
i = 0
for ctrl_ref, cond, regulon in dict_label:
Idx_embedd[i] = ctrl_ref + "-" + cond + ":\t" + regulon
i += 1
return Idx_embedd # this is the global dictionary where we can query the entire 3D structure
def create_global_tree(self, leaf_size = 10): ## can be expensive
""" what if we want to compare the strcuture of one SCENIC run and do it with another? checking reproducibility"""
X, Y, Z = self.construct_3D_embedding()
Idx_embedd = self.calc_global_dictonary()
NMF_XYZ = np.column_stack(((X, Y, Z)))
entire_tree = KDTree_whole(NMF_XYZ, leaf_size) # query the entire tree structure! #scikit will leaarn the structure
return entire_tree, Idx_embedd
def compute_tau_and_kdtree(self):
"""Compute Kendall's Tau and KDTree for each label."""
for i in self.labels:
y = self.data[i].tolist()
tau, p_value = kendalltau(self.x, y)
self.tau_p_value[i] = (tau, p_value)
query_kdtree = KDTree_layer(np.column_stack((self.x, y)))
cond1x, condy, test1, h = self.nmf_transform(np.array(list(zip(self.x, y))))
self.h_sep[i] = h
self.NMF_embedd[i] = (cond1x, condy, test1, tau, query_kdtree)
self.sep.append(tau)
def return_the_tau_pval(self):
return self.tau_p_value
def analyze_factors(self, cond2, percentages = False):
H = self.h_sep[cond2]
def plotHeatmap(H):
sns.heatmap(H, annot=True, cmap="YlGnBu",
xticklabels=[self.cond1 + " (control)", cond2],
yticklabels=[f'NMF_{i+1}' for i in range(H.shape[0])])
plt.title('Percentages (H) with Vectors {} and {}'.format(self.cond1, cond2) if percentages
else 'Coefficient Matrix (H) with Vectors {} and {}'.format(self.cond1, cond2))
plt.tight_layout()
plt.savefig('{}_{}.png'.format(self.cond1, cond2))
plt.show()
plt.figure(figsize=(8, 6))
if percentages:
components = []
for i in H:
components.append([item/sum(i) for item in i])
plotHeatmap(np.array(components))
else:
plotHeatmap(H)
def nmf_transform(self, rss_data):
"""NMF transfomration"""
n_components = 2 # Reduce to 2 components for visualization
nmf = NMF(n_components=n_components, init='random', random_state=42)
embedding = nmf.fit_transform(rss_data)
H_matrix = nmf.components_
return(embedding[:, 0], embedding[:, 1], embedding, H_matrix)
def map_labels_to_tau(self):
"""Map tau values to labels, handling duplicates."""
for i, lbl in list(zip(self.sep, self.labels)):
if i in self.map_me: # Just in case we have the same tau value
prev_value = self.map_me[i]
print(lbl, "warning: has a dup")
self.map_me[i] = (prev_value, lbl)
else:
self.map_me[i] = lbl # Make the tau into a key
def sort_and_print_labels(self):
"""Sort tau values and print corresponding labels."""
self.sep = sorted(self.sep)
for i in self.sep:
print(self.map_me[i], i)
def init_order(self): ## how to construct the 3d embedding space
order = []
condition_label = []
## make sure self.spe is sorted
for i in sorted(list(set(self.sep))):
if isinstance(self.map_me[i], str):
#print("path A")
order.append(self.map_me[i])
#print(order)
else:
print("path B")
for i in self.map_me[i]:
order.append(i)
item = order[len(order) - 1]
order = order[:-1]
#print("getting rid of {}".format(item))
# remove one from the top, this is after order is stacked here
for i in order:
print(i)
condition_label.append([i] * len(self.label))
self.order = order # subtract the one here!
self.condition_label = np.array(condition_label).flatten()
def construct_3D_embedding(self, rawRSS = False):
X, Y, Z = [], [], []
def prepare_data(data_source):
""" Helper function to prepare the data for plotting """
nonlocal X, Y, Z
for i in self.order:
x = data_source[self.cond1].tolist() if rawRSS else self.NMF_embedd[i][0]
y = data_source[i].tolist() if rawRSS else self.NMF_embedd[i][1]
Z_val = [self.NMF_embedd[i][3]] * len(self.label)
X.extend(x)
Y.extend(y)
Z.extend(Z_val)
if rawRSS:
prepare_data(self.data)
else:
prepare_data(self.NMF_embedd)
# Convert lists to np.array and flatten them
X = np.array(X).flatten()
Y = np.array(Y).flatten()
Z = np.array(Z).flatten()
return(X, Y, Z)
def initDict(self, label):
dict = {}
for i in range(len(label)):
dict[i] = label[i]
return dict
def plot_3dEmbedding(self, rawRSS = False, regulonsToView = [], clustersToLabel = []):
condition_color_map = {}
for i in self.order: # create the color mappings!
color = f'#{random.randint(0, 0xFFFFFF):06x}'
condition_color_map[i] = color
X, Y, Z = self.construct_3D_embedding(rawRSS)
# Create the figure and 3D axis
fig = plt.figure(figsize=(15, 17))
ax = fig.add_subplot(111, projection='3d')
print(self.labels)
dict = self.initDict(self.label)
if len(clustersToLabel) == 0:
clustersToLabel = list(set(self.condition_label))
print(self.label)
print(condition_color_map)
if len(regulonsToView) > 0:
for i in range(len(X)):
item = dict[i % len(self.label)]
if item in regulonsToView and self.condition_label[i] in clustersToLabel:
ax.text(X[i], Y[i], Z[i], s=str(item),
color=condition_color_map[self.condition_label[i]],
fontsize=12, fontweight='bold', style='italic')
# Plot the data points for each condition
for condition in np.unique(self.condition_label):
mask = self.condition_label == condition
ax.scatter(X[mask], Y[mask], Z[mask], color=condition_color_map[condition], label=condition)
# Set the labels and title
ax.set_xlabel(f'RSS (control) {self.cond1}' if rawRSS else 'NMF_1')
ax.set_ylabel('RSS treatment' if rawRSS else 'NMF_2')
ax.set_zlabel("Kendall's Tau")
ax.set_title(f'3D Plot of {self.cond1} vs Conditions with Kendall\'s Tau using NMF reduction'
if not rawRSS else f"3D Plot of RSS scores {self.cond1} vs Conditions with Kendall\'s Tau")
# Add legend
ax.legend()
# Show the plot
plt.show()
def construct_tree(self):
"""Construct the entire tree of processing and print the result."""
self.compute_tau_and_kdtree()
self.map_labels_to_tau()
self.sort_and_print_labels()# this is how the 3D embedding will be assembled
self.init_order() # we are assemling the tree!
def plotRSS_NMF(self, cond2, drawQuadrants = True, include_pvals = False, alpha = 0.05, tryLabel = ""):
"""Visualize the 2D embeddings between control and treatment group"""
cond1x = self.NMF_embedd[cond2][0]
condy = self.NMF_embedd[cond2][1]
test1 = self.NMF_embedd[cond2][2]
p_vals = {}
plt.scatter(cond1x, condy)
def returnThreshold_dictionary(threshold_file, regulonC = None):
thresholds, regulonNames, regulonThresholds = None, [], []
if "3.5_" in threshold_file:
thresholds = pd.read_csv(threshold_file, sep="\t").T
regulonNames = list(map(lambda x: x.split(" ")[0], thresholds.loc["regulon"].tolist()))
regulonThresholds = thresholds.loc[regulonC] # by the row.
else:
thresholds = pd.read_csv(threshold_file)
regulonNames = list(map(lambda x: x.split(" ")[0], thresholds[regulonC]))
regulonThreshold = thresholds[regulonC] # column must be named like this ...
threshold_dict = {}
for i in range(len(regulonNames)):
threshold_dict[regulonNames[i]] = regulonThresholds[i]
i += 1
return threshold_dict
def rundifferential_test_AUC(df, metaClusterLabel, control, treatment, regulon_name, threshold_file, alpha = .05):
"""
Add a column to mark successful cells (greater than the mean enrichment score)
M is the total number of objects, (all the cells = total population (low and high regulon activity)
n is total number of Type I objects. (all cells that pass threshold)
The random variate represents the number of Type I objects
in N drawn without replacement from the total population.
k is the active amount of cells we are interested in
N the anumberof cells you are planning to sample from the tissue
sf(k, M, n, N, loc=0)
"""
df.columns = list(map(lambda x: x.split(" ")[0], df.columns))
if threshold_file is None:
calculate_mean = df[regulon_name].mean()
df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > df[regulon_name].mean()
else:
threshold = returnThreshold_dictionary(threshold_file, 'threshold')
#print(threshold[regulon_name], regulon_name)
df['is_successful_{}'.format(regulon_name)] = df[regulon_name] > threshold[regulon_name]
# Count successful control cells
control_successful_cells = df[(df[metaClusterLabel] == control) & (df['is_successful_{}'.format(regulon_name)])].shape[0]
# Count successful treatment cells
treatment_successful_cells = df[(df[metaClusterLabel] == treatment) & (df['is_successful_{}'.format(regulon_name)])].shape[0]
total_successful_cells = df[df['is_successful_{}'.format(regulon_name)]].shape[0]
total_population = df.shape[0]
n_1 = df[df[metaClusterLabel] == control].shape[0]
n_2 = df[df[metaClusterLabel] == treatment].shape[0]
p_value_1 = hypergeom.sf(control_successful_cells - 1, total_population, total_successful_cells, n_1)
p_value_2 = hypergeom.sf(treatment_successful_cells -1, total_population, total_successful_cells, n_2)
return(regulon_name, round(p_value_1, 2), round(p_value_2, 2))
def drawText(cond1x, condy):
i = 0
for x_2, y_2 in list(zip(cond1x, condy)):
text = self.label[i]
if include_pvals:
#rundifferential_test_AUC(df, control, treatment, regulon_name, alpha = .05)
min_font_size = 6
max_font_size = 12
chk = self.threshold_file
_, _, p_value_treatment= rundifferential_test_AUC(self.RAS_AUC,
self.metaDataCol,
self.cond1,
cond2,
text,
chk)
color = 'red' if p_value_treatment < alpha else 'black'
if color == 'black':
font_size = min_font_size
else:
font_size = max(min_font_size, min(max_font_size, max_font_size / (p_value_treatment * 5)))
plt.text(x_2, y_2, text, ha='center', va='bottom', fontsize= font_size, fontweight='bold', color=color) # the bigger the p_value, smaller text
p_vals[text] = p_value_treatment
else:
plt.text(x_2, y_2, text, ha='center', va='bottom', fontsize=6.5, fontweight='bold')
i += 1
drawText(cond1x, condy)
#plt.title("NMF Embedding of RSS Values {} and {}".format(self.cond1, cond2))
plt.xlabel("NMF_base ({})".format(self.cond1))
plt.ylabel("NMF_exp ({})".format(cond2))
if drawQuadrants:
plt.axvline(x = np.median(cond1x), color="black", linestyle="--")
plt.axhline(y = np.median(condy), color="black", linestyle="--")
plt.savefig('{}_{}.png'.format(self.cond1,cond2))
plt.show()
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(test1)
plt.scatter(cond1x, condy, c=kmeans.labels_, cmap='viridis')
t2 = drawText(cond1x, condy)
if drawQuadrants:
plt.axvline(x = np.median(cond1x), color="black", linestyle="--")
plt.axhline(y = np.median(condy), color="black", linestyle="--")
plt.title("K-Means Clustering after NMF {} and {}".format(self.cond1, cond2), fontsize=8.5)
plt.xlabel("NMF_base {}".format(self.cond1))
plt.ylabel("NMF_exp {}".format(cond2))
#plt.savefig('{}_{}.png'.format(self.cond1,cond2)) # Save as PNG
#plt.colorbar(label="Cluster Label")
plt.show()
return p_vals
InĀ [22]:
import warnings
warnings.filterwarnings("ignore")
InĀ [23]:
comparison2.order
Out[23]:
['Acute-Infected_blood_CD4+ CTL', 'Uninfected_blood_CD4+ CTL', 'Acute-Infected_brain_CD4+ CTL']
InĀ [24]:
comparison2.plot_3dEmbedding()
['Uninfected_blood_CD4+ CTL', 'Acute-Infected_blood_CD4+ CTL', 'Uninfected_brain_CD4+ CTL', 'Acute-Infected_brain_CD4+ CTL']
['RUNX2', 'ETS1', 'TFDP2', 'NR3C1', 'ELF1', 'BDP1', 'NRF1', 'TAF1', 'EP300', 'JUN', 'CREM', 'REL', 'FOS', 'ATF3', 'FOSL2', 'E2F3', 'TBX21', 'EOMES', 'IRF8', 'KLF10', 'BRCA1', 'E2F1', 'TFDP1', 'JUND', 'IRF9', 'RFX2', 'KLF8', 'MYB', 'CUX1', 'HIF1A', 'MYC', 'NFATC1', 'NFKB1', 'CHD2', 'JUNB', 'GABPA', 'YY1', 'SMARCA4', 'MAX', 'ATF7', 'ATF6', 'STAT2', 'IRF2', 'SREBF2', 'POLR2A', 'RELB', 'STAT5A', 'IRF1', 'IRF7', 'STAT1', 'RELA', 'PBX3', 'POLE3', 'ELF2']
{'Acute-Infected_blood_CD4+ CTL': '#64616e', 'Uninfected_blood_CD4+ CTL': '#aef150', 'Acute-Infected_brain_CD4+ CTL': '#f8bf8c'}
InĀ [25]:
regulon_sig2
Out[25]:
{'Uninfected_blood_CD4+ CTL': {'RUNX2': 0.0,
'ETS1': 0.49,
'TFDP2': 1.0,
'NR3C1': 1.0,
'ELF1': 1.0,
'BDP1': 1.0,
'NRF1': 1.0,
'TAF1': 1.0,
'EP300': 1.0,
'JUN': 1.0,
'CREM': 0.88,
'REL': 1.0,
'FOS': 0.99,
'ATF3': 1.0,
'FOSL2': 0.0,
'E2F3': 0.17,
'TBX21': 0.0,
'EOMES': 0.94,
'IRF8': 1.0,
'KLF10': 0.13,
'BRCA1': 1.0,
'E2F1': 1.0,
'TFDP1': 1.0,
'JUND': 0.98,
'IRF9': 0.02,
'RFX2': 0.45,
'KLF8': 0.98,
'MYB': 1.0,
'CUX1': 1.0,
'HIF1A': 1.0,
'MYC': 1.0,
'NFATC1': 0.9,
'NFKB1': 1.0,
'CHD2': 1.0,
'JUNB': 1.0,
'GABPA': 0.8,
'YY1': 0.49,
'SMARCA4': 1.0,
'MAX': 0.04,
'ATF7': 1.0,
'ATF6': 0.97,
'STAT2': 0.02,
'IRF2': 0.69,
'SREBF2': 1.0,
'POLR2A': 1.0,
'RELB': 1.0,
'STAT5A': 0.0,
'IRF1': 0.0,
'IRF7': 0.0,
'STAT1': 0.0,
'RELA': 0.0,
'PBX3': 1.0,
'POLE3': 1.0,
'ELF2': 0.51},
'Acute-Infected_blood_CD4+ CTL': {'RUNX2': 1.0,
'ETS1': 0.01,
'TFDP2': 1.0,
'NR3C1': 1.0,
'ELF1': 1.0,
'BDP1': 0.99,
'NRF1': 1.0,
'TAF1': 1.0,
'EP300': 1.0,
'JUN': 1.0,
'CREM': 0.0,
'REL': 1.0,
'FOS': 1.0,
'ATF3': 1.0,
'FOSL2': 0.0,
'E2F3': 1.0,
'TBX21': 0.0,
'EOMES': 1.0,
'IRF8': 1.0,
'KLF10': 0.23,
'BRCA1': 1.0,
'E2F1': 1.0,
'TFDP1': 1.0,
'JUND': 0.9,
'IRF9': 0.01,
'RFX2': 1.0,
'KLF8': 1.0,
'MYB': 1.0,
'CUX1': 1.0,
'HIF1A': 1.0,
'MYC': 1.0,
'NFATC1': 1.0,
'NFKB1': 1.0,
'CHD2': 1.0,
'JUNB': 1.0,
'GABPA': 1.0,
'YY1': 1.0,
'SMARCA4': 0.79,
'MAX': 0.77,
'ATF7': 1.0,
'ATF6': 1.0,
'STAT2': 0.96,
'IRF2': 1.0,
'SREBF2': 1.0,
'POLR2A': 1.0,
'RELB': 1.0,
'STAT5A': 1.0,
'IRF1': 0.15,
'IRF7': 0.98,
'STAT1': 0.07,
'RELA': 0.0,
'PBX3': 1.0,
'POLE3': 1.0,
'ELF2': 0.3},
'Uninfected_brain_CD4+ CTL': {'RUNX2': 0.0,
'ETS1': 1.0,
'TFDP2': 0.0,
'NR3C1': 0.0,
'ELF1': 0.0,
'BDP1': 0.0,
'NRF1': 0.2,
'TAF1': 1.0,
'EP300': 1.0,
'JUN': 1.0,
'CREM': 0.96,
'REL': 0.99,
'FOS': 0.0,
'ATF3': 0.0,
'FOSL2': 0.0,
'E2F3': 0.0,
'TBX21': 0.0,
'EOMES': 0.08,
'IRF8': 0.0,
'KLF10': 0.0,
'BRCA1': 1.0,
'E2F1': 1.0,
'TFDP1': 1.0,
'JUND': 0.89,
'IRF9': 0.99,
'RFX2': 1.0,
'KLF8': 1.0,
'MYB': 1.0,
'CUX1': 1.0,
'HIF1A': 1.0,
'MYC': 1.0,
'NFATC1': 1.0,
'NFKB1': 1.0,
'CHD2': 1.0,
'JUNB': 1.0,
'GABPA': 0.0,
'YY1': 1.0,
'SMARCA4': 0.03,
'MAX': 0.0,
'ATF7': 0.0,
'ATF6': 0.0,
'STAT2': 0.0,
'IRF2': 0.23,
'SREBF2': 0.0,
'POLR2A': 0.0,
'RELB': 1.0,
'STAT5A': 0.04,
'IRF1': 0.0,
'IRF7': 0.99,
'STAT1': 1.0,
'RELA': 1.0,
'PBX3': 1.0,
'POLE3': 0.01,
'ELF2': 0.0},
'Acute-Infected_brain_CD4+ CTL': {'RUNX2': 1.0,
'ETS1': 1.0,
'TFDP2': 1.0,
'NR3C1': 1.0,
'ELF1': 1.0,
'BDP1': 0.17,
'NRF1': 1.0,
'TAF1': 1.0,
'EP300': 1.0,
'JUN': 1.0,
'CREM': 0.0,
'REL': 0.51,
'FOS': 0.0,
'ATF3': 0.0,
'FOSL2': 0.0,
'E2F3': 0.16,
'TBX21': 0.0,
'EOMES': 1.0,
'IRF8': 1.0,
'KLF10': 0.0,
'BRCA1': 1.0,
'E2F1': 1.0,
'TFDP1': 1.0,
'JUND': 0.96,
'IRF9': 0.0,
'RFX2': 1.0,
'KLF8': 1.0,
'MYB': 1.0,
'CUX1': 1.0,
'HIF1A': 1.0,
'MYC': 1.0,
'NFATC1': 1.0,
'NFKB1': 1.0,
'CHD2': 1.0,
'JUNB': 1.0,
'GABPA': 0.0,
'YY1': 0.93,
'SMARCA4': 0.03,
'MAX': 0.01,
'ATF7': 1.0,
'ATF6': 0.79,
'STAT2': 0.16,
'IRF2': 0.43,
'SREBF2': 0.61,
'POLR2A': 0.0,
'RELB': 0.98,
'STAT5A': 1.0,
'IRF1': 0.0,
'IRF7': 0.99,
'STAT1': 0.03,
'RELA': 0.0,
'PBX3': 1.0,
'POLE3': 0.04,
'ELF2': 0.0}}
perm testing¶
InĀ [161]:
def plot(ax, cond1, cond2, z_vals, pdf): # we'll reuse this
ax.plot(z_vals, pdf)
ax.set_title("Kendall Tau Test Null Distribution between {} and {}".format(cond1, cond2))
ax.set_xlabel("statistic")
ax.set_ylabel("probability density")
def nullDist(data, control, treatment):
n = len(data[control])
var = (2*(2*n + 5)) / (9*n*(n - 1))
dist = stats.norm(scale=np.sqrt(var))
z_vals = np.linspace(-1.25, 1.25, 100)
pdf = dist.pdf(z_vals)
fig, ax = plt.subplots(figsize=(8, 5))
plot(ax, control, treatment, z_vals, pdf)
plt.show()
return dist, z_vals, pdf
InĀ [162]:
def final_permutation_Test(data, control, treatment):
dist, z_vals, pdf = nullDist(data, control, treatment)
x = data[control]
y = data[treatment]
def statistic(x): # explore all possible pairings by permuting `x`
return stats.kendalltau(x, y).statistic # ignore pvalue
ref = stats.permutation_test((x,), statistic,
permutation_type='pairings')
fig, ax = plt.subplots(figsize=(8, 5))
plot(ax, control, treatment, z_vals, pdf)
bins = np.linspace(-1.25, 1.25, 25)
ax.hist(ref.null_distribution, bins=bins, density=True)
ax.legend(['asymptotic approximation\n(many observations)',
'exact null distribution'])
plot(ax, control, treatment, z_vals, pdf)
plt.show()
return ref.pvalue
InĀ [311]:
final_permutation_Test(pbData,"Naive CD4 T", "CD14+ Mono")
Out[311]:
0.0038
InĀ [312]:
comparison2.compareLayers("Uninfected_brain_CD4+ CTL", "Acute-Infected_brain_CD4+ CTL", .045) ## with the control
InĀ [24]:
pbData.columns
Out[24]:
Index(['Unnamed: 0', 'Memory CD4 T', 'B', 'CD14+ Mono', 'NK', 'CD8 T',
'FCGR3A+ Mono', 'Naive CD4 T', 'DC', 'Platelet'],
dtype='object')
InĀ [11]:
pbData = pd.read_csv("PBMC_study/QA_QC_PBMC_rss_values_Feb3.csv")
df_pbmc_RAS = pd.read_csv("PBMC_study/obj_AUC_metadata_PBMC_2_Feb22.csv")
labels3 = pbData.columns[1:-1].tolist()
PBMC_cell_Types = [
"Naive CD4 T",
"FCGR3A+ Mono",
"CD14+ Mono",
"Memory CD4 T",
'DC',
'NK',
'B'
# "CD8 T"
]
comparisonPBMC = ComparisonTree("Naive CD4 T", df_pbmc_RAS, "cell_clusters", pbData, PBMC_cell_Types, "Unnamed: 0", "PBMC_study/3.5_AUCellThresholds_Info_PVMC_QA_QC.tsv")
comparisonPBMC.construct_tree()
regulon_sig_PB = {}
for i in PBMC_cell_Types:
p_vals = comparisonPBMC.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
regulon_sig_PB[i] = p_vals
comparisonPBMC.plot_3dEmbedding()
FCGR3A+ Mono -0.3754152823920266 CD14+ Mono -0.3023255813953488 DC -0.22480620155038758 NK 0.16500553709856036 B 0.46843853820598 Memory CD4 T 0.769656699889258 Naive CD4 T 0.9999999999999999 FCGR3A+ Mono CD14+ Mono DC NK B Memory CD4 T
['Naive CD4 T', 'FCGR3A+ Mono', 'CD14+ Mono', 'Memory CD4 T', 'DC', 'NK', 'B']
['SPIB', 'RFX5', 'IRF8', 'NFE2L2', 'IRF1', 'RELB', 'CEBPA', 'FOSL2', 'IRF7', 'NFE2', 'KLF4', 'SPI1', 'POLE3', 'STAT1', 'IRF2', 'TBL1XR1', 'MAX', 'ATF3', 'KLF11', 'ELF1', 'TRIM69', 'YY1', 'REL', 'KLF12', 'GTF2B', 'UTP18', 'KLF13', 'CTCF', 'THAP1', 'DBP', 'XBP1', 'TBX21', 'HOXB2', 'ATF1', 'ELK3', 'POLR2A', 'BCLAF1', 'ZNF143', 'ETS1', 'FLI1', 'KLF2', 'ELF2', 'ATF4']
{'FCGR3A+ Mono': '#f6405c', 'CD14+ Mono': '#13de99', 'DC': '#8579ca', 'NK': '#033f63', 'B': '#e96a19', 'Memory CD4 T': '#ecbb49'}
InĀ [13]:
pbData = pd.read_csv("PBMC_study/QA_QC_PBMC_rss_values_Feb3.csv")
df_pbmc_RAS = pd.read_csv("PBMC_study/obj_AUC_metadata_PBMC_2_Feb22.csv")
labels3 = pbData.columns[1:-1].tolist()
PBMC_cell_Types = [
"Naive CD4 T",
"FCGR3A+ Mono",
"CD14+ Mono",
"Memory CD4 T",
"CD8 T",
"NK",
"DC",
"B"
]
comparisonPBMC = ComparisonTree("Naive CD4 T", df_pbmc_RAS, "cell_clusters", pbData, PBMC_cell_Types, "Unnamed: 0", "PBMC_study/3.5_AUCellThresholds_Info_PVMC_QA_QC.tsv")
comparisonPBMC.construct_tree()
regulon_sig_PB = {}
for i in PBMC_cell_Types:
p_vals = comparisonPBMC.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
regulon_sig_PB[i] = p_vals
comparisonPBMC.plot_3dEmbedding()
FCGR3A+ Mono -0.3754152823920266 CD14+ Mono -0.3023255813953488 DC -0.22480620155038758 NK 0.16500553709856036 B 0.46843853820598 CD8 T 0.49058693244739754 Memory CD4 T 0.769656699889258 Naive CD4 T 0.9999999999999999 FCGR3A+ Mono CD14+ Mono DC NK B CD8 T Memory CD4 T
['Naive CD4 T', 'FCGR3A+ Mono', 'CD14+ Mono', 'Memory CD4 T', 'CD8 T', 'NK', 'DC', 'B']
['SPIB', 'RFX5', 'IRF8', 'NFE2L2', 'IRF1', 'RELB', 'CEBPA', 'FOSL2', 'IRF7', 'NFE2', 'KLF4', 'SPI1', 'POLE3', 'STAT1', 'IRF2', 'TBL1XR1', 'MAX', 'ATF3', 'KLF11', 'ELF1', 'TRIM69', 'YY1', 'REL', 'KLF12', 'GTF2B', 'UTP18', 'KLF13', 'CTCF', 'THAP1', 'DBP', 'XBP1', 'TBX21', 'HOXB2', 'ATF1', 'ELK3', 'POLR2A', 'BCLAF1', 'ZNF143', 'ETS1', 'FLI1', 'KLF2', 'ELF2', 'ATF4']
{'FCGR3A+ Mono': '#c63b25', 'CD14+ Mono': '#efef35', 'DC': '#814652', 'NK': '#0968c5', 'B': '#cff513', 'CD8 T': '#da04f6', 'Memory CD4 T': '#0f6ff9'}
InĀ [316]:
comparisonPBMC.plot_3dEmbedding(regulonsToView=["FOSL2", "RFX5", "CEBPA"], clustersToLabel=["CD14+ Mono", "FCGR3A+ Mono"])
['Naive CD4 T', 'FCGR3A+ Mono', 'CD14+ Mono', 'Memory CD4 T']
['SPIB', 'RFX5', 'IRF8', 'NFE2L2', 'IRF1', 'RELB', 'CEBPA', 'FOSL2', 'IRF7', 'NFE2', 'KLF4', 'SPI1', 'POLE3', 'STAT1', 'IRF2', 'TBL1XR1', 'MAX', 'ATF3', 'KLF11', 'ELF1', 'TRIM69', 'YY1', 'REL', 'KLF12', 'GTF2B', 'UTP18', 'KLF13', 'CTCF', 'THAP1', 'DBP', 'XBP1', 'TBX21', 'HOXB2', 'ATF1', 'ELK3', 'POLR2A', 'BCLAF1', 'ZNF143', 'ETS1', 'FLI1', 'KLF2', 'ELF2', 'ATF4']
{'FCGR3A+ Mono': '#04edb4', 'CD14+ Mono': '#d44efd', 'Memory CD4 T': '#83e710'}
InĀ [187]:
comparisonPBMC.compareLayers("Naive CD4 T", "Memory CD4 T", .015)
InĀ [166]:
data.columns
Out[166]:
Index(['Unnamed: 0', 'Acute-Infected_blood_CD4+ CTL',
'Acute-Infected_blood_CD4+ Naive', 'Acute-Infected_blood_CD4+ EM',
'Acute-Infected_blood_CD4+ Treg',
'Acute-Infected_blood_CD4+ Proliferating',
'Acute-Infected_brain_CD8+ Effector', 'Acute-Infected_brain_CD4+ CTL'],
dtype='object')
InĀ [104]:
acute_infected_cell_types
Out[104]:
['Acute-Infected_blood_CD4+ Naive', 'Acute-Infected_blood_CD4+ Treg', 'Acute-Infected_blood_CD4+ EM', 'Acute-Infected_blood_CD4+ Proliferating', 'Acute-Infected_blood_CD4+ CTL']
InĀ [163]:
data = pd.read_csv("SIV_monkey/rss_AcuteInfected.csv") ## this would be one comparison (RSS)
df_RAS = pd.read_csv("SIV_monkey/RAS_AUC_AcuteInfected.csv") ## grab this from your SCENIC stuff, include ALL METADATA AUC AND cellLabels
acute_infected_cell_types = [
"Acute-Infected_blood_CD4+ Naive",
"Acute-Infected_blood_CD4+ Treg",
"Acute-Infected_blood_CD4+ EM",
"Acute-Infected_blood_CD4+ Proliferating",
"Acute-Infected_blood_CD4+ CTL"
]
compariso3n = ComparisonTree('Acute-Infected_blood_CD4+ CTL', df_RAS, "SCENIC_RSS_3", data, acute_infected_cell_types, "Unnamed: 0", "SIV_monkey/3.5_AUCellThresholds_Info_Acute_infected.tsv")
compariso3n.construct_tree()
egulon_sig_PB = {}
for i in acute_infected_cell_types:
p_vals = compariso3n.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
egulon_sig_PB[i] = p_vals
Acute-Infected_blood_CD4+ Naive -0.49480519480519486 Acute-Infected_blood_CD4+ Treg -0.38701298701298703 Acute-Infected_blood_CD4+ EM -0.32597402597402597 Acute-Infected_blood_CD4+ Proliferating 0.4 Acute-Infected_blood_CD4+ CTL 1.0 Acute-Infected_blood_CD4+ Naive Acute-Infected_blood_CD4+ Treg Acute-Infected_blood_CD4+ EM Acute-Infected_blood_CD4+ Proliferating
InĀ [165]:
set(df_RAS["name"].tolist())
Out[165]:
{'93T', '94T', '95T'}
InĀ [40]:
bonf_list = [egulon_sig_PB[""][i] for i in egulon_sig_PB["Acute-Infected_blood_CD4+ Naive"]]
InĀ [103]:
data = pd.read_csv("SIV_monkey/rss_AcuteInfected.csv") ## this would be one comparison (RSS)
df_RAS = pd.read_csv("SIV_monkey/RAS_AUC_AcuteInfected.csv") ## grab this from your SCENIC stuff, include ALL METADATA AUC AND cellLabels
collect_types.init_types_collection()
acute_infected_cell_types = [
"Acute-Infected_blood_CD4+ Naive",
"Acute-Infected_blood_CD4+ Treg",
"Acute-Infected_blood_CD4+ EM",
"Acute-Infected_blood_CD4+ Proliferating",
"Acute-Infected_brain_CD4+ CTL",
"Acute-Infected_blood_CD4+ CTL"
]
compariso3n2 = ComparisonTree('Acute-Infected_blood_CD4+ CTL', df_RAS, "SCENIC_RSS_3", data, acute_infected_cell_types, "Unnamed: 0", "SIV_monkey/3.5_AUCellThresholds_Info_Acute_infected.tsv")
compariso3n2.construct_tree()
egulon_sig_PB = {}
for i in acute_infected_cell_types:
p_vals = compariso3n2.plotRSS_NMF(i, drawQuadrants=True, include_pvals=True)
egulon_sig_PB[i] = p_vals
Acute-Infected_blood_CD4+ Naive -0.49480519480519486 Acute-Infected_blood_CD4+ Treg -0.38701298701298703 Acute-Infected_blood_CD4+ EM -0.32597402597402597 Acute-Infected_blood_CD4+ Proliferating 0.4 Acute-Infected_brain_CD4+ CTL 0.625974025974026 Acute-Infected_blood_CD4+ CTL 1.0 Acute-Infected_blood_CD4+ Naive Acute-Infected_blood_CD4+ Treg Acute-Infected_blood_CD4+ EM Acute-Infected_blood_CD4+ Proliferating Acute-Infected_brain_CD4+ CTL
Bonferroni correction!¶
InĀ [149]:
cluster = "Acute-Infected_blood_CD4+ EM"
InĀ [150]:
bonf_lis= [egulon_sig_PB[cluster][i] for i in egulon_sig_PB[cluster]]
InĀ [151]:
from statsmodels.sandbox.stats.multicomp import multipletests
p_adjusted = multipletests(bonf_lis,alpha=0.01, method='bonferroni')
InĀ [152]:
for i, ele in list(zip(p_adjusted[0].tolist(), egulon_sig_PB[cluster].keys())):
if i:
print(ele)
FOS ZNF250 MXI1 CHD2 RFX1 STAT1
InĀ [109]:
len(compariso3n2.label)
Out[109]:
56
InĀ [298]:
compariso3n.plot_3dEmbedding(regulonsToView=["ZNF250", "MXI1"],
clustersToLabel=["Acute-Infected_blood_CD4+ Naive", "Acute-Infected_blood_CD4+ EM", "Acute-Infected_blood_CD4+ Treg"])
['Acute-Infected_blood_CD4+ Naive', 'Acute-Infected_blood_CD4+ Treg', 'Acute-Infected_blood_CD4+ EM', 'Acute-Infected_blood_CD4+ Proliferating', 'Acute-Infected_blood_CD4+ CTL']
['NFYB', 'BRCA1', 'E2F1', 'TFDP1', 'YY1', 'RELA', 'SP3', 'RELB', 'JUN', 'FOS', 'ZNF250', 'EP300', 'NFKB1', 'ATF6', 'TAF1', 'AHCTF1', 'JUNB', 'MXI1', 'CHD2', 'ELF1', 'ETS1', 'FLI1', 'NR3C1', 'RB1', 'TFDP2', 'IRF4', 'WRNIP1', 'ETV7', 'E2F3', 'RFX5', 'EOMES', 'TBX21', 'DDIT3', 'REL', 'JUND', 'CREM', 'FOSB', 'ATF3', 'NR2C2', 'SIN3A', 'ELF2', 'MLXIP', 'GABPA', 'ATF2', 'SP1', 'E2F4', 'PBX3', 'SMAD5', 'RFX1', 'RXRA', 'ELF4', 'ELK3', 'IRF2', 'IRF1', 'STAT1', 'IRF9']
{'Acute-Infected_blood_CD4+ Naive': '#bc9599', 'Acute-Infected_blood_CD4+ Treg': '#22add5', 'Acute-Infected_blood_CD4+ EM': '#b8d999', 'Acute-Infected_blood_CD4+ Proliferating': '#c88184'}
InĀ [313]:
a = final_permutation_Test(data, "Acute-Infected_brain_CD4+ CTL", "Acute-Infected_blood_CD4+ CTL")
print(a)
0.0002
InĀ [302]:
compariso3n2.compareLayers("Acute-Infected_blood_CD4+ CTL", "Acute-Infected_brain_CD4+ CTL", .05) ## this was just the Acute Infected
InĀ [175]:
import pandas as pd
from scipy.stats import kendalltau
# Assuming 'df' is your DataFrame with the relevant columns
columns = ["Acute-Infected_blood_CD4+ Naive", "Acute-Infected_blood_CD4+ EM", "Acute-Infected_blood_CD4+ Treg"]
# Initialize an empty dictionary to store Kendall Tau results
kendall_results = {}
# Nested for loop to calculate pairwise Kendall Tau
for col1 in columns:
kendall_results[col1] = {}
for col2 in columns:
tau, _ = kendalltau(data[col1], data[col2])
kendall_results[col1][col2] = tau
# Convert the dictionary to a pandas DataFrame for better readability
kendall_df = pd.DataFrame(kendall_results)
# Print the DataFrame, which you can copy and paste
print(kendall_df.to_string(index=True))
Acute-Infected_blood_CD4+ Naive Acute-Infected_blood_CD4+ EM Acute-Infected_blood_CD4+ Treg Acute-Infected_blood_CD4+ Naive 1.000000 0.631169 0.744156 Acute-Infected_blood_CD4+ EM 0.631169 1.000000 0.697403 Acute-Infected_blood_CD4+ Treg 0.744156 0.697403 1.000000